Matthew Talluto
28.11.2022
The model: mu = a + b * x; y ~ normal(mu, sigma). Save the Stan code in a
.stan file and use the rstan package to invoke the Stan interpreter.
rstan.data block tells Stan what to expect from all the
data you will pass it
parameters block identifies random
unknowns
transformed parameters does what the name
implies
data or
parameters can be used for transformations// Stan version
data {
int n; // a single integer named n, number of data points
vector [n] y; // a vector of n real numbers, named y
vector [n] x;
}
parameters {
real a;
real b;
real <lower=0> s; // a single real number that must be positive
}
transformed parameters {
// fixed: was a * b * x — the linear predictor is intercept plus slope
vector [n] mu = a + b * x;
// or another example
// vector [n] log_x = log(x);
}
data {
int n; // a single integer named n, number of data points
vector [n] y; // a vector of n real numbers, named y
vector [n] x;
}
parameters {
real a;
real b;
real <lower=0> s; // a single real number that must be positive
}
transformed parameters {
// fixed: was a * b * x — the linear predictor is intercept plus slope
vector [n] mu = a + b * x;
}
model {
y ~ normal(mu, s); // likelihood
}
# Log posterior for the linear regression y ~ Normal(a + b*x, s).
# params: named vector with elements 'a' (intercept), 'b' (slope), 's' (sd)
# data: list with elements 'y' (response) and 'x' (predictor)
# returns: unnormalized log posterior density (-Inf when s is not positive)
log_posterior = function(params, data) {
y = data[['y']]
x = data[['x']]
a = params['a']
b = params['b']
s = params['s']
# the standard deviation must be positive; reject impossible proposals
if(s <= 0)
return(-Inf)
mu = a + b * x
log_lhood = sum(dnorm(y, mu, s, log = TRUE))
# priors match the Stan version: a ~ N(0,10), b ~ N(0,5), s ~ Exp(0.1)
# fixed: second prior term used undefined name 'B' instead of 'b'
log_prior = dnorm(a, 0, 10, log=TRUE) +
dnorm(b, 0, 5, log=TRUE) +
dexp(s, 0.1, log=TRUE)
return(log_lhood + log_prior)
}
data {
int n; // a single integer named n, number of data points
vector [n] y; // a vector of n real numbers, named y
vector [n] x;
}
parameters {
real a;
real b;
real <lower=0> s; // a single real number that must be positive
}
transformed parameters {
// fixed: was a * b * x — the linear predictor is intercept plus slope
vector [n] mu = a + b * x;
}
model {
y ~ normal(mu, s); // likelihood
a ~ normal(0, 10); // priors
b ~ normal(0, 5);
s ~ exponential(0.1);
}
Save this code to a file. For example:
stan/sample_stan_lm.stan
Compile the model with the stan_model function, then draw samples with the
sampling function.
## R version
# get the data and format it
library(data.table)
Howell1 = fread("https://github.com/rmcelreath/rethinking/raw/master/data/Howell1.csv")
mh_data = list(
y = Howell1$height, # fixed: comma was missing between list elements
x = Howell1$weight
)
# load the metropolis sampler from this course
source("r/mh.r")
# take 10000 samples, with adaptation
samples = metropolis(log_posterior, initial = c(a = 0, b = 0, s = 1),
data = mh_data)
library(rstan)
library(data.table)
# get the data and format it for rstan
Howell1 = fread("https://github.com/rmcelreath/rethinking/raw/master/data/Howell1.csv")
stan_data = list(
y = Howell1$height,
x = Howell1$weight,
n = length(Howell1$weight)
)
# compile the model
model = stan_model(file = "stan/sample_stan_lm.stan")
# take 5000 samples
# fixed: was `data = standata`, but the list above is named stan_data
samples = sampling(model, iter=5000, data = stan_data)
Scalar types: int and real. Container types:
vector <constraint> [length] name
vector <constraint> [length] name
matrix <constraint> [rows, columns] name
vector <constraint> [length] name
matrix <constraint> [rows, columns] name
type <constraint> name [dimensions]
s = c(147, 126, 183, 88, 9, 203, 16, 10, 112, 205)
\[ \begin{aligned} s & \sim \mathrm{U}(1, N_{max}) \\ N_{max} & \sim \mathrm{Gamma}(\alpha=0.001, \beta=0.001) \end{aligned} \]
Use rstan::optimizing to get this:
s = c(147, 126, 183, 88, 9, 203, 16, 10, 112, 205)
standat = list(n = length(s), s = s)
fit_map = optimizing(tanks_mod, data = standat)
fit_map$par
## Nmax
## 205As before, the MAP is identical to max(s), but many
other values are possible!
bayesplot package is great for visualising output
from stan, metropolis, and many other Bayesian
packages.# parameter explanations:
# iter: how many iterations (you will get iter/2 samples)
# chains: how many parallel MCMC chains?
# refresh: if 0, stan won't show you a progress bar (nice for rmd files)
fit = sampling(tanks_mod, data = standat, iter = 5000, chains = 1, refresh = 0)
samples = as.matrix(fit)
head(samples)
## parameters
## iterations Nmax lp__
## [1,] 226.1407 -56.75793
## [2,] 226.1407 -56.75793
## [3,] 211.6211 -57.17141
## [4,] 206.6496 -58.29355
## [5,] 206.9084 -58.16191
## [6,] 205.5159 -59.39410
source("../r/metrop.r") ## for HDPI
rbind(hdpi = hpdi(samples[,1]),
quantile = quantile(samples[,1], c(0.05, 0.95)))
## 5% 95%
## hdpi 205.0023 260.3608
## quantile 205.8477 279.2651\[ \mathbb{E}(y) = a + b_1x + b_2x^2 \]
Howell1$age_group = cut(Howell1$age, breaks = c(-1, 12, 22, 200),
labels = c("child", "young", "adult"))
Howell1
## height weight age male age_group
## 1: 151.765 47.825606 63 1 adult
## 2: 139.700 36.485806 63 0 adult
## 3: 136.525 31.864838 65 0 adult
## 4: 156.845 53.041914 41 1 adult
## 5: 145.415 41.276872 51 0 adult
## ---
## 540: 145.415 31.127751 17 1 young
## 541: 162.560 52.163080 31 1 adult
## 542: 156.210 54.062496 21 0 young
## 543: 71.120 8.051258 0 1 child
## 544: 158.750 52.531623 68 1 adult
lm and other tools often handle this automatically.
Howell1$is_child = ifelse(Howell1$age_group == "child", 1, 0)
Howell1$is_young = ifelse(Howell1$age_group == "young", 1, 0)
Howell1
## height weight age male age_group is_child is_young
## 1: 151.765 47.825606 63 1 adult 0 0
## 2: 139.700 36.485806 63 0 adult 0 0
## 3: 136.525 31.864838 65 0 adult 0 0
## 4: 156.845 53.041914 41 1 adult 0 0
## 5: 145.415 41.276872 51 0 adult 0 0
## ---
## 540: 145.415 31.127751 17 1 young 0 1
## 541: 162.560 52.163080 31 1 adult 0 0
## 542: 156.210 54.062496 21 0 young 0 1
## 543: 71.120 8.051258 0 1 child 1 0
## 544: 158.750 52.531623 68 1 adult 0 0data {
int n;
vector [n] height;
vector [n] weight;
vector <lower = 0, upper = 1> [n] is_child;
vector <lower = 0, upper = 1> [n] is_young;
}
parameters {
real a; // default intercept (for adults)
real a_child; // additional effect of being a child
real a_young; // additional effect of being young
real b;
real <lower=0> s;
}
transformed parameters {
vector [n] intercept = a + a_child * is_child + a_young * is_young;
// fixed: the declared data are named weight/height, not x/y
vector [n] mu = intercept + b * weight;
}
model {
height ~ normal(mu, s); // fixed: was `y`, which is not declared
a ~ normal(0, 10);
a_child ~ normal(0, 10);
a_young ~ normal(0, 10);
b ~ normal(0, 5);
s ~ exponential(0.1);
}data {
int n;
vector [n] height;
vector [n] weight;
vector <lower = 0, upper = 1> [n] is_child;
vector <lower = 0, upper = 1> [n] is_young;
}
parameters {
real a; // default intercept (for adults)
real a_child; // additional effect of being a child
real a_young; // additional effect of being young
real b; // default slope (for adults)
real b_child; // additional slope effect of being a child
real b_young; // additional slope effect of being young
real <lower=0> s;
}
transformed parameters {
vector [n] intercept = a + a_child * is_child + a_young * is_young;
vector [n] slope = b + b_child * is_child + b_young * is_young;
// fixed: use the declared data name (weight, not x) and elementwise
// multiplication — slope and weight are both vectors, so Stan needs .*
vector [n] mu = intercept + slope .* weight;
}
model {
height ~ normal(mu, s); // fixed: data are height/weight, not y/x
a ~ normal(0, 10);
a_child ~ normal(0, 10);
a_young ~ normal(0, 10);
b ~ normal(0, 5);
b_child ~ normal(0, 5);
b_young ~ normal(0, 5);
s ~ exponential(0.1);
}\[ \mathbb{E}(y) = a + \mathbf{B}\mathbf{X} \]
X = cbind("weight" = Howell1$weight,
"weight^2" = Howell1$weight^2,
"child" = Howell1$is_child,
"young" = Howell1$is_young,
# we need separate variables for interactions
"weight × child" = Howell1$weight * Howell1$is_child,
"weight × young" = Howell1$weight * Howell1$is_young)
head(X)
## weight weight^2 child young weight × child weight × young
## [1,] 47.82561 2287.289 0 0 0 0
## [2,] 36.48581 1331.214 0 0 0 0
## [3,] 31.86484 1015.368 0 0 0 0
## [4,] 53.04191 2813.445 0 0 0 0
## [5,] 41.27687 1703.780 0 0 0 0
## [6,] 62.99259 3968.066 0 0 0 0\[ \begin{aligned} \mathbb{E}(y) & = a + \mathbf{B}\mathbf{X} \\ y & \sim \mathcal{N}(\mathbb{E}(y), s) \end{aligned} \]
\[ \begin{aligned} \mathbb{E}(y) & = a + \mathbf{B}\mathbf{X} \\ \rho & = \mathcal{f}(\mathbb{E}(y), \ldots) \\ y & \sim \mathcal{D}(\rho) \end{aligned} \]
tsuga = readRDS("exercises/data/tsuga.rds")
tsuga[, 4:8]
## born died n annual_mean_temp tot_annual_pp
## 1: 1 0 0 3.414667 1085.2000
## 2: 0 3 5 3.849333 1003.0000
## 3: 3 0 6 3.452000 1076.1333
## 4: 1 0 3 3.620000 1099.4000
## 5: 0 4 4 4.596000 989.4667
## ---
## 1538: 0 1 1 4.329333 1234.0000
## 1539: 1 0 3 4.134000 1191.0000
## 1540: 0 1 4 4.372000 1207.2667
## 1541: 0 1 2 4.247333 1220.2667
## 1542: 0 1 4 4.283333 1200.9333\[ \begin{aligned} \mathbb{E}(n_{died}/n_{total}) &= p = a + bT \\ n_{died} & \sim \mathrm{Binomial}(n_{total}, p) \end{aligned} \]
// Binomial GLM with an *identity* link: p = a + b * temperature.
// NOTE(review): nothing constrains p to [0, 1], so the sampler can propose
// invalid probabilities — this version appears deliberately broken to
// motivate the inv_logit version later in the document (see the NaN
// warnings in the R equivalent below).
data {
int n; // number of plots
int n_died [n]; // number of dead trees in each plot
int n_total [n]; // number of trees in each plot
vector [n] annual_mean_temp; // temperature
}
parameters {
real a;
real b;
// no s: there is no variance parameter
// for the binomial distribution!
}
transformed parameters {
// WARNING: p may fall outside [0, 1] — see note above
vector [n] p = a + b * annual_mean_temp;
}
model {
n_died ~ binomial(n_total, p);
a ~ normal(0, 10);
b ~ normal(0, 5);
}log_posterior = function(par, dat) {
# the probability of death changes with temperature
# NOTE(review): identity link — p is not forced into [0, 1], so dbinom
# returns NaN whenever the optimizer proposes p < 0 or p > 1 (hence the
# warnings shown below); the plogis version later fixes this
p = par['a'] + par['b'] * dat$annual_mean_temp
# likelihood
logpr = sum(dbinom(dat$died, dat$n, p, log=TRUE))
# priors
logpr + dnorm(par['a'], 0, 20, log=TRUE) +
dnorm(par['b'], 0, 5, log=TRUE)
}
fit = optim(c(a=0.5, b=0), log_posterior, dat = tsuga[n > 0],
method = "Nelder-Mead", control=list(fnscale=-1))
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced
## Warning in dbinom(dat$died, dat$n, p, log = TRUE): NaNs produced\[ \begin{aligned} \log \frac{p}{1-p} & = a + \mathbf{BX} \\ \mathrm{logit} (p) & = a + \mathbf{BX} \\ p & = \mathrm{logit}^{-1} (a + \mathbf{BX}) \\ p & = \frac{e^{a + \mathbf{BX}}}{1 + e^{a + \mathbf{BX}}} \end{aligned} \]
In R: p = plogis(a + X %*% B). In Stan: p = inv_logit(a + X * B), or use the
binomial_logit function.
// saved in file stan/tree_binom.stan
data {
int n; // number of plots
int n_died [n]; // number of dead trees in each plot
int n_total [n]; // number of trees in each plot
vector [n] annual_mean_temp; // temperature
}
parameters {
real a;
real b;
// no s: there is no variance parameter
// for the binomial distribution!
}
transformed parameters {
// inv_logit maps the linear predictor into (0, 1), so p is always a
// valid probability (unlike the identity-link version above)
vector [n] p = inv_logit(a + b * annual_mean_temp);
}
model {
n_died ~ binomial(n_total, p);
// priors are the same regardless
a ~ normal(0, 10);
b ~ normal(0, 5);
}log_posterior = function(par, dat) {
# the probability of death changes with temperature
# plogis is the inverse logit, so p is guaranteed to lie in (0, 1)
p = plogis(par['a'] + par['b'] * dat$annual_mean_temp)
# likelihood
logpr = sum(dbinom(dat$died, dat$n, p, log=TRUE))
# priors
# NOTE(review): the prior sd for 'a' is 20 here, but the Stan version of
# this same model uses a ~ normal(0, 10) — confirm which is intended
logpr + dnorm(par['a'], 0, 20, log=TRUE) +
dnorm(par['b'], 0, 5, log=TRUE)
}
fit = optim(c(a=0.5, b=0), log_posterior, dat = tsuga[n > 0],
method = "Nelder-Mead", control=list(fnscale=-1),
hessian = TRUE)
vcv = solve(-fit$hessian)
fit$par
## a b
## -1.0266212 -0.1291986\[ \begin{aligned} \mathbb{E}(y) & = \mathrm{L}^{-1}\left (a + \mathbf{B}\mathbf{X} \right )\\ \rho & = \mathcal{f}(\mathbb{E}(y), \ldots) \\ y & \sim \mathcal{D}(\rho) \end{aligned} \]
Many distributions have so-called canonical link functions
Normal: mu = a + X %*% B (identity); Binomial: p = plogis(a + X %*% B)
(logit); Poisson: lambda = exp(a + X %*% B) (log); Gamma:
lambda = 1/(a + X %*% B) (inverse). Other distributions have typical link
functions, but need reparameterization: mu = exp(a + X %*% B) (log);
mu = pnorm(a + X %*% B) (probit); mu = 1/(a + X %*% B) (inverse).
\[ \begin{aligned} \lambda & = \exp(a + \mathbf{BX}) \\ y & \sim \mathrm{Poisson}(\lambda) \end{aligned} \]
Constraint: \(\sigma^2_{y|\mathbf{X}} = \mathbb{E}(y|\mathbf{X}) = \lambda\)
\[ y \sim \mathrm{Poisson}(u\lambda) \]
\[ \begin{aligned} \mu & = \exp(a + \mathbf{BX}) \\ y & \sim \mathrm{NB}(\mu, \phi) \end{aligned} \]
\(\phi\) is called the dispersion parameter, and is related to the variance:
\[ \sigma^2_{y|\mathbf{X}} = \mu + \frac{\mu^2}{\phi} \]
\[ \begin{aligned} \mu & = \mathrm{logit}^{-1}(a_\rho + \mathbf{B_\rho X_\rho}) \\ \phi & = \exp(a_\phi + \mathbf{B_\phi X_\phi}) & \mathrm{\small optionally} \\ \alpha & = \mu\phi \\ \beta & = (1 - \mu)\phi \\ y & \sim \mathrm{Beta}(\alpha, \beta) \end{aligned} \]
\[ \begin{aligned} \mu & = \frac{1}{(a + \mathbf{B X})} & OR \\ \mu & = \exp(a + \mathbf{B X}) \\ \\ \alpha & = \frac{\mu^2}{\phi} \\ \beta & = \frac{\mu}{\phi} \\ y & \sim \mathrm{Gamma}(\alpha, \beta) \end{aligned} \]
For this exercise, you will use the birddiv
(in docs/exercises/data/birddiv.csv) dataset; you can load
it directly from github using data.table::fread(). Bird
diversity was measured in 1-km^2 plots in multiple countries of Europe,
investigating the effects of habitat fragmentation and productivity on
diversity. We will consider a subset of the data. Specificially, we will
ask how various covariates are associated with the diversity of birds
specializing on different habitat types. The data have the following
potential predictors:
All of the above variables are standardized to a 0-100 scale. Consider this when choosing priors.
Your response variable will be richness, the bird
species richness in the plot. Additionally, you have an indicator
variable hab_type. This is not telling you what habitat
type was sampled (plots included multiple habitats). Rather, this is
telling you what type of bird species were counted for the richness
measurement: so hab_type == "forest" & richness == 7
indicates that 7 forest specialists were observed in that plot.
Build one or more generalised linear models for bird richness. Your task should be to describe two things: (1) how does richness vary with climate, productivity, fragmentation, or habitat diversity, and (2) do these relationships vary depending on what habitat bird species specialize on.
// Poisson GLM: log(lambda) = a + X * B
data {
int <lower=0> n; // number of data points
int <lower=0> k; // number of x-variables
int <lower=0> richness [n]; // response: species richness per plot
matrix [n,k] X; // predictor matrix
}
parameters {
real a; // intercept
vector [k] B; // one slope per column of X
}
transformed parameters {
vector <lower=0> [n] lambda;
lambda = exp(a + X * B); // log link keeps lambda positive
}
model {
richness ~ poisson(lambda);
a ~ normal(0, 10);
B ~ normal(0, 5);
}
generated quantities {
// posterior predictive simulation, one draw per plot
int r_predict [n];
for(i in 1:n)
r_predict[i] = poisson_rng(lambda[i]);
// fixed: removed redundant `r_predict = poisson_rng(lambda);` — the loop
// above already fills r_predict
}library(data.table)
birds = fread("exercises/data/birddiv.csv")
# stan can't handle NAs
birds = birds[complete.cases(birds)]
# original variables scaled from 0-100; rescale to go from 0 to 1
X_scaled = as.matrix(birds[,c(2:7)])/100
# I will try two models
# First, simple model, only forest birds in relation to forest cover
for_i = which(birds$hab_type == "forest")
standat1 = list(
n = length(for_i),
k = 1,
richness = birds$richness[for_i],
X = X_scaled[for_i, "NDVI", drop=FALSE])
fit1 = sampling(bird_glm, data = standat1, iter=2000,
chains = 4, refresh = 0)# Second, looking at how two variables influence birds of different types
# grab two variables
X = X_scaled[, c("For.cover", "NDVI")]
# add a categorical variable for bird type
X = cbind(X, open=ifelse(birds$hab_type == "open",1, 0))
X = cbind(X, generalist=ifelse(birds$hab_type == "generalist",1, 0))
# add interaction terms with the categories
# columns: 1 = For.cover, 2 = NDVI, 3 = open, 4 = generalist
# fixed: op_NDVI and ge_forCov used swapped columns (For.cover*generalist
# and NDVI*open instead of NDVI*open and For.cover*generalist)
X = cbind(X, op_forCov=X[,1]*X[,3], op_NDVI=X[,2]*X[,3],
ge_forCov=X[,1]*X[,4], ge_NDVI=X[,2]*X[,4])
head(X)
## For.cover NDVI open generalist op_forCov op_NDVI ge_forCov ge_NDVI
## [1,] 0.998675 0.60381356 0 0 0 0 0 0
## [2,] 0.000000 0.22881356 0 0 0 0 0 0
## [3,] 0.382825 0.11864407 0 0 0 0 0 0
## [4,] 0.114125 0.19067797 0 0 0 0 0 0
## [5,] 0.000000 0.02118644 0 0 0 0 0 0
## [6,] 1.000000 0.54025424 0 0 0 0 0 0
standat2 = with(birds, list(
n = length(richness),
k = ncol(X),
richness = richness,
X = X))
fit2 = sampling(bird_glm, data = standat2, iter=2000,
chains = 4, refresh = 0)## Use as.array if you want to keep different mcmc chains separate
## This is ideal for diagnostics
## For inference, you usually want to lump all chains
## In this case, you use as.matrix
samp1_pars = as.array(fit1, pars=c('a', 'B'))
mcmc_combo(samp1_pars, c("hist", "trace"))print(fit1, pars = c('a', 'B'))
## Inference for Stan model: 5c0b16bbee48bcf4a957b09a484d1746.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## a 1.55 0.00 0.09 1.38 1.49 1.55 1.61 1.73 757 1.01
## B[1] 0.67 0.01 0.15 0.38 0.57 0.67 0.77 0.95 768 1.01
##
## Samples were drawn using NUTS(diag_e) at Sun Nov 27 10:48:42 2022.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).# compute posterior distribution of the residual sum of squares
samp1_lam = as.matrix(fit1, pars='lambda')
sq_resid1 = apply(samp1_lam, 1, function(x) (standat1$richness - x)^2)
# compute posterior distribution of dispersion parameter, which is just
# sum(squared residuals)/(n - k)
# here k is 2, we have an intercept and one slope
# if phi > 1, we have overdispersion and need a better model
phi = apply(sq_resid1, 2, function(x) sum(x)/(length(x) - 2))
quantile(phi, c(0.5, 0.05, 0.95))
## 50% 5% 95%
## 16.49676 16.39371 16.84566## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
# compute posterior distribution of the residual sum of squares
samp2_lam = as.matrix(fit2, pars='lambda')
sq_resid2 = apply(samp2_lam, 1, function(x) (standat2$richness - x)^2)
# compute posterior distribution of dispersion parameter, which is just
# sum(squared residuals)/(n - k)
# fixed: the "k is 2" comment/divisor was copy-pasted from the first model;
# this model has an intercept plus eight slopes (ncol(X) = 8), so k = 9
# if phi > 1, we have overdispersion and need a better model
phi = apply(sq_resid2, 2, function(x) sum(x)/(length(x) - 9))
quantile(phi, c(0.5, 0.05, 0.95))
## 50% 5% 95%
## 7.474143 7.381813 7.653988